setwd(‘/Users/ajdonna/Desktop/CoursesNpractice/FaceBook Data’)
FB_Year <- read.csv('Yearly_FB_data.csv')
summary(FB_Year)
## YEAR Mean_Likes Median_Likes Mean_Comments
## Min. :2011 Min. : 4.00 Min. : 4.000 Min. :0.000
## 1st Qu.:2012 1st Qu.: 7.00 1st Qu.: 4.000 1st Qu.:1.000
## Median :2014 Median :13.00 Median : 8.000 Median :2.000
## Mean :2014 Mean :16.57 Mean : 9.571 Mean :2.429
## 3rd Qu.:2016 3rd Qu.:27.50 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :2017 Max. :30.00 Max. :19.000 Max. :5.000
## Median_Comments Likes Comments Posts
## Min. :0.0000 Min. : 62 Min. : 12.0 Min. : 14.0
## 1st Qu.:0.0000 1st Qu.:1002 1st Qu.:140.0 1st Qu.: 72.5
## Median :0.0000 Median :2856 Median :186.0 Median :125.0
## Mean :0.8571 Mean :2314 Mean :316.6 Mean :151.1
## 3rd Qu.:1.5000 3rd Qu.:3373 3rd Qu.:463.0 3rd Qu.:183.0
## Max. :3.0000 Max. :4534 Max. :812.0 Max. :408.0
## TYPE_VIDEO TYPE_LINK TYPE_PHOTO TYPE_STATUS
## Min. : 0.00 Min. : 0.00 Min. : 5.00 Min. : 9.00
## 1st Qu.: 0.50 1st Qu.:11.00 1st Qu.:33.50 1st Qu.: 11.00
## Median : 1.00 Median :23.00 Median :49.00 Median : 18.00
## Mean : 53.57 Mean :19.14 Mean :42.57 Mean : 35.71
## 3rd Qu.: 26.50 3rd Qu.:27.00 3rd Qu.:56.50 3rd Qu.: 38.50
## Max. :320.00 Max. :35.00 Max. :64.00 Max. :124.00
str(FB_Year)
## 'data.frame': 7 obs. of 12 variables:
## $ YEAR : int 2011 2012 2013 2014 2015 2016 2017
## $ Mean_Likes : int 4 5 13 25 30 30 9
## $ Median_Likes : int 4 4 8 19 19 9 4
## $ Mean_Comments : int 1 1 4 5 4 2 0
## $ Median_Comments: int 0 0 1 3 2 0 0
## $ Likes : int 62 506 2856 3162 1497 4534 3584
## $ Comments : int 12 132 812 661 186 265 148
## $ Posts : int 14 95 217 125 50 149 408
## $ TYPE_VIDEO : int 0 0 1 1 6 47 320
## $ TYPE_LINK : int 0 16 28 23 6 35 26
## $ TYPE_PHOTO : int 5 47 64 55 20 58 49
## $ TYPE_STATUS : int 9 32 124 45 18 9 13
FB_Year$f_YEAR <- factor(FB_Year$YEAR)
library(ggplot2)
ggplot(data = FB_Year, aes(YEAR, Posts))+
geom_line()+
scale_x_continuous(breaks= seq(2011,2017,1))+
scale_y_continuous(breaks= seq(0,800,100))+
geom_line(data = FB_Year,aes(YEAR, Comments),color = 'blue')
ggplot(data = FB_Year,aes(YEAR, Mean_Likes))+
geom_line()+
ylab("Mean_Likes/Mean_Comments")+
scale_y_continuous(breaks=seq(0,30,5))+
scale_x_continuous(breaks= seq(2011,2017,1))+
geom_line(data = FB_Year,aes(YEAR, Mean_Comments),color = 'red')
ggplot(data = FB_Year,aes(YEAR, log2(Comments)))+
geom_line()+
scale_x_continuous(breaks= seq(2011,2017,1))+
geom_line(data = FB_Year,aes(YEAR, Mean_Comments),color = 'red')+
geom_line(data = FB_Year,aes(YEAR, Median_Comments),color = 'blue')
FB <- read.csv('FBposts_Words.csv')
str(FB)
## 'data.frame': 1075 obs. of 21 variables:
## $ ID : Factor w/ 1073 levels "1405174859572227_1000278100061907",..: 775 775 572 586 585 583 580 579 578 576 ...
## $ DAY : int 31 31 30 29 24 8 25 25 25 19 ...
## $ MONTH : int 12 12 12 12 12 12 10 10 10 10 ...
## $ YEAR : int 2010 2010 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ DATE : Factor w/ 577 levels "1/1/13","1/1/15",..: 171 171 169 164 158 179 85 85 85 74 ...
## $ HOUR : int 20 20 5 3 9 7 8 8 8 9 ...
## $ MIN : int 0 0 29 54 33 59 40 36 7 39 ...
## $ SEC : int 0 0 49 29 4 4 52 27 44 43 ...
## $ TIME : Factor w/ 1060 levels "00:00:59","00:02:16",..: 925 925 166 88 322 279 305 303 285 324 ...
## $ TYPE : Factor w/ 5 levels "link","note",..: 1 1 3 3 4 3 4 3 4 4 ...
## $ LIKES : int 0 0 5 1 5 0 0 3 6 3 ...
## $ COMMENTS: int 0 0 0 0 1 0 0 0 2 3 ...
## $ POS : num 0 0 0.667 0 0.045 0 0 0 0.355 0.213 ...
## $ NEG : num 0 0 0 0 0.138 0 0 0 0.041 0 ...
## $ NEU : num 0 0 0.333 0 0.817 0 0 0 0.604 0.787 ...
## $ COMP : num 0 0 0.612 0 -0.964 0 0 0 0.96 0.852 ...
## $ WORDS : int 0 0 0 0 94 0 0 0 22 24 ...
## $ X : int NA 0 NA NA NA NA NA NA NA NA ...
## $ X.1 : int NA 0 NA NA NA NA NA NA NA NA ...
## $ X.2 : int NA 0 NA NA NA NA NA NA NA NA ...
## $ X.3 : int NA 0 NA NA NA NA NA NA NA NA ...
FB$fYear<- factor(FB$YEAR)
FB$fMonth<- factor(FB$MONTH)
summary(FB$COMMENTS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.085 2.000 70.000
summary(FB$LIKES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 6.00 15.37 15.00 233.00
ggplot(data=subset(FB,YEAR!=2010),aes(COMMENTS, LIKES))+
geom_point(aes(color = fYear))+
scale_x_continuous(breaks=seq(0,20,2),limits = c(0,20))+
scale_y_continuous(limits = c(0,125))
## Warning: Removed 21 rows containing missing values (geom_point).
Notes: From the graph it is clear that the posts from Latest Years are the ones to garner more no. of likes as compared to the posts to have received high number of Comments which are from Early Years!
ggplot(data=subset(FB,YEAR!=2010),aes(COMMENTS, LIKES))+
geom_point(aes(color = fYear, size=WORDS))+
scale_x_continuous(breaks=seq(0,20,2),limits = c(0,20))+
scale_y_continuous(limits = c(0,125))
## Warning: Removed 21 rows containing missing values (geom_point).
FB$Words_Level <- factor(cut(FB$WORDS, breaks = c(-1,25,50,75,100,226), labels = c("Vlow","Low","Med","High","Vhigh")))
ggplot(data=subset(FB,YEAR!=2010),aes(COMMENTS, LIKES))+
geom_point(aes(color = Words_Level))+
facet_wrap(~YEAR,scales="free")
Notes: It appears that beginning from year 2013, the posts with VERY LOW words(<25) or LOW(<50) have been getting more LIKES as compared to COMMENTS, with some outliers here and there in the form of words with more number of WORDS!
FB$Week <- cut(FB$DAY, breaks = c(0,8,15,22,28,31), labels = 1:5)
ggplot(data=subset(FB,YEAR!=2010),aes(Week))+
geom_histogram(stat="count",color="black",fill="#678912")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Notes: From this plot, it becomes evident that No. of posts spread across Weeks of a month are similar in distribution except for the week 5! Let’s see if the distribution is same across all the years.
ggplot(data=subset(FB,YEAR!=2010),aes(Week))+
geom_histogram(stat="count",color="black",fill="#678912")+
facet_wrap(~YEAR, scales="free")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Notes: From 2012 through 2016, the maximum no. of posts were shared during the first week of the month, with approximately same number of posts shared across the weeks except the last week(5th). But it is the scale of No. of Posts which differs considerably from one year to another. Year 2013’s scale is almost double to that in year 2012, 2014 and 2016. But Year 2017, the scale has already taken a 4 times jump!
ggplot(data=subset(FB,YEAR!=2010 & TYPE!='note'),aes(Week))+
geom_histogram(stat="count",aes(fill=TYPE),color="black")+
facet_wrap(~YEAR, scales="free")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Notes: For Year 2012-> Photo, 2013-> Status, 2014-> Status & Photo, 2015 -> Status&Photo, 2017 -> Video. It is clear that it was in 2015 that considerable number of posts were shared across all the 4 categories. Prior to that it was either a clash between Photo and status in terms of more number of posts or either one of them dominated.
summary(FB$WORDS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 16.8 23.0 226.0
ggplot(data=subset(FB, FB$YEAR!='2010'),aes(fYear, WORDS))+
geom_boxplot()+
ylim(0,quantile(FB$WORDS,0.98))
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
ggplot(data=subset(FB, FB$YEAR!='2010'),aes(Words_Level, LIKES))+
geom_boxplot()+
ylim(0,quantile(FB$LIKES,0.95))
## Warning: Removed 54 rows containing non-finite values (stat_boxplot).
Notes: Except for the “Very LOW” category of words, all the other categories seem to have same number of Median Likes except for the VERY HIGH category of words(100+), which has the highest median as compared to all others
ggplot(data=subset(FB, FB$YEAR!='2010'),aes(Words_Level, LIKES))+
geom_boxplot()+
ylim(0,quantile(FB$LIKES,0.95))+
facet_wrap(~YEAR, scales="free")
## Warning: Removed 54 rows containing non-finite values (stat_boxplot).
Notes: 2012,2013 is the only year where there has been a consistent increase in the median LIKES with increase in number of words. 2014,2015 and 2016 are the 3 years, which register the Highest Median Likes for the LOW number of words used in the posts! In 2015-LOW, the Median Likes reach its zenith, close to 50. 2017 is the only year when there is an consistency in the number of likes across all the WORD Categories!
ggplot(data=subset(FB,YEAR!=2010),aes(Words_Level))+
xlab("Number of Words")+
geom_histogram(stat="count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(data=subset(FB,YEAR!=2010),aes(Words_Level))+
xlab("Number of Words")+
geom_histogram(stat="count")+
facet_wrap(~YEAR,scales = "free")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Notes: For every year, the highest number of posts fall in the cateogory of having used Very Low number of Words! For 2017, the distribution is very sparse across the categories with number of POSTS very low words being Alarmingly High. That’s probably the reason there was a Increase in the Median Likes for year 2017! Less No of POSTS!
ggplot(data=subset(FB, YEAR!=2010 & TYPE!='note'),aes(TYPE))+
geom_histogram(stat="count",color="black",fill="#34A389")+
facet_wrap(~YEAR,scales="free")+
xlab("Status Type")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Notes: Importantly, there has been a gradual increase in the Video Status being shared with the years.The VIDEO status being shared began in 2013 and it increased steadily with zenith currently in 2017! The distribution across all other categories has been more or less the same throughout the years except for 2015 which registered a serious dip in the overall statuses and 2013 which saw a significant increase in the status, reaching to a maximum of 120.
ggplot(data=subset(FB, YEAR!=2010 & TYPE!='note'),aes(YEAR))+
geom_freqpoly(stat="count",aes(color=TYPE))+
ylab("POSTS")+
scale_x_continuous(breaks=seq(2011,2017,1))+
scale_y_continuous(breaks = seq(0,300,50))
Notes: As visible, there is a steep increase in the number of Videos being shared in year 2017
ggplot(data=subset(FB,YEAR!=2010 & TYPE!='note'),aes(WORDS,LIKES))+
geom_point(aes(color=TYPE))+
scale_y_continuous(breaks=seq(0,125,25),limits=c(0,125))+
scale_x_continuous(breaks=seq(0,150,25),limits=c(0,150))+
geom_hline(yintercept = mean(FB$LIKES),color='black')+
geom_hline(yintercept = median(FB$LIKES),color='red')
## Warning: Removed 11 rows containing missing values (geom_point).
Notes: The PHOTO Status are way ahead in generating LIKES followed by STATUS and Likes for LINK and VIDEO.
ggplot(data=subset(FB,YEAR!=2010 & TYPE!='note'),aes(WORDS,LIKES))+
geom_point(aes(color=TYPE))+
geom_hline(yintercept = mean(FB$LIKES),color='black')+
geom_hline(yintercept = median(FB$LIKES),color='red')+
facet_wrap(~YEAR,scales="free")
Notes: Distribution of Likes is same as that of Distribution of Comments
by(FB$Words_Level, FB$YEAR, summary)
## FB$YEAR: 2010
## Vlow Low Med High Vhigh
## 2 0 0 0 0
## --------------------------------------------------------
## FB$YEAR: 2011
## Vlow Low Med High Vhigh
## 10 2 0 2 0
## --------------------------------------------------------
## FB$YEAR: 2012
## Vlow Low Med High Vhigh
## 65 7 8 10 5
## --------------------------------------------------------
## FB$YEAR: 2013
## Vlow Low Med High Vhigh
## 135 49 25 6 2
## --------------------------------------------------------
## FB$YEAR: 2014
## Vlow Low Med High Vhigh
## 76 14 14 15 7
## --------------------------------------------------------
## FB$YEAR: 2015
## Vlow Low Med High Vhigh
## 32 5 5 1 7
## --------------------------------------------------------
## FB$YEAR: 2016
## Vlow Low Med High Vhigh
## 113 16 5 8 7
## --------------------------------------------------------
## FB$YEAR: 2017
## Vlow Low Med High Vhigh
## 391 10 11 6 4
ggplot(data=subset(FB, FB$YEAR!='2010'),aes(Words_Level, COMMENTS))+
geom_boxplot()+
ylim(0,quantile(FB$COMMENTS,0.98))
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
Notes: The Median comments are pretty same for every word category except when the number of Words are Very High, the Median comments shoot to 5 varying from 2.5 otherwise!
by(FB$COMMENTS,FB$Words_Level, summary)
## FB$Words_Level: Vlow
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.067 1.000 70.000
## --------------------------------------------------------
## FB$Words_Level: Low
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 3.000 4.214 6.500 29.000
## --------------------------------------------------------
## FB$Words_Level: Med
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 4.000 6.044 8.000 46.000
## --------------------------------------------------------
## FB$Words_Level: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 5.562 7.000 48.000
## --------------------------------------------------------
## FB$Words_Level: Vhigh
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.750 5.000 7.812 9.500 38.000
ggplot(data=subset(FB, FB$YEAR!='2010'),aes(Words_Level, COMMENTS))+
geom_boxplot()+
ylim(0,quantile(FB$COMMENTS,0.98))+
facet_wrap(~YEAR,scales="free")
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
Notes: The range of number of comments is same for all the years as evident by the same Y-scale across all the years. 2014 is the only year when there is considerable number of median comments spread across all Word-Categories! The highest value of Median Comments happens in 2015 for the High category! Let’s see what’s the reason behind the nice Comment trends in year 2014!
ggplot(data=subset(FB,YEAR!=2010),aes(Words_Level))+
xlab("Number of Words")+
geom_histogram(stat="count",color='black',fill='#565689')+
facet_wrap(~YEAR,scales="free")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
FB$TYPE <- as.character(FB$TYPE)
ggplot(data=subset(FB,YEAR!=2010 & TYPE!='note'),aes(WORDS,COMMENTS))+
geom_point(aes(color=TYPE))+
scale_y_continuous(breaks=seq(0,40,10),limits=c(0,40))+
scale_x_continuous(breaks=seq(0,150,25),limits=c(0,150))+
geom_hline(yintercept = mean(FB$COMMENTS),color='black')+
geom_hline(yintercept = median(FB$COMMENTS),color='red')
## Warning: Removed 6 rows containing missing values (geom_point).
Notes: It is visible that High number of Comments are mostly attributed to Posts of TYPE:STATUS. The POSTS of type LINK, VIDEO are not the ones to have got many Comments! Let’s see the above distribution across the Years
ggplot(data=subset(FB,YEAR!=2010 & TYPE!='note'),aes(WORDS,COMMENTS))+
geom_point(aes(color=TYPE))+
facet_wrap(~YEAR,scales="free")+
geom_hline(yintercept = mean(FB$COMMENTS),color='black')+
geom_hline(yintercept = median(FB$COMMENTS),color='red')
Notes: It shows that Year 2013,2014 and 2015 were all about STATUSES with POSTS varying in length from too Few words to too Many! FOr 2016 and 2017, it were the PHOTOS to have received majority of comments! Even though, the Number of Videos were significantly higher than photos in 2017 and comparable in 2016.
ggplot(data=subset(FB,YEAR!=2010 &YEAR!=2011 & TYPE!='note'),aes(MONTH,LIKES))+
geom_point(aes(color=TYPE, size=WORDS))+
scale_x_continuous(breaks=seq(1,12,1))+
ylim(0,quantile(FB$LIKES,0.98))+
geom_freqpoly(stat='summary',fun.y=mean)+
facet_wrap(~YEAR,scales="free")
## Warning: Removed 21 rows containing non-finite values (stat_summary).
## Warning: Removed 21 rows containing missing values (geom_point).
Notes: This plot provides several insights into the overall process!As observed earlier, year 2013, 2014 and 2015 were dominant by STATUS with overall no. of posts being low in 2015. With the passage of years, the Number of Likes for posts with More Words has increased. From 2013 to 2016 and right into 2017, the trend is continuing. Talking about Mean LIKES, they have almost been uniform across the months except for year 2015 and 2016!Particularly in 2016, there was a steep dip in the Month of May, which was the time when I quit my job to go for further studies. And as evident, the LIKES thereafter have been marginally more than the previous ones!